This code covers chapter 2 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar.

CC This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.

Preparation

Load the iris data set (set of records as a data.frame)

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Data Quality

Inspect the data (plot on a data.frame actually produces a pairs plot). You may be able to spot noise and outliers.

plot(iris, col=iris$Species)

Get summary statistics for each column (outliers, missing values)

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Are there duplicate entries?

i <- duplicated(iris)
i
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Which object is a duplicate?

which(i)
## [1] 143
iris[i,]
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 143          5.8         2.7          5.1         1.9 virginica

See also ? unique and ? complete.cases: Often you will do something like:

clean.data <- unique(iris[complete.cases(iris),])
summary(clean.data)
##   Sepal.Length    Sepal.Width    Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.00   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.00   Median :4.300   Median :1.300  
##  Mean   :5.844   Mean   :3.06   Mean   :3.749   Mean   :1.195  
##  3rd Qu.:6.400   3rd Qu.:3.30   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.40   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :49  
##                 
##                 
## 

Note that one case (non-unique) is gone. All cases with missing values will also have been removed.

Aggregation

Aggregate by species (using mean or median)

aggregate(. ~ Species, data = iris, FUN = mean)
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa        5.006       3.428        1.462       0.246
## 2 versicolor        5.936       2.770        4.260       1.326
## 3  virginica        6.588       2.974        5.552       2.026
aggregate(. ~ Species, data = iris, FUN = median)
##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          5.0         3.4         1.50         0.2
## 2 versicolor          5.9         2.8         4.35         1.3
## 3  virginica          6.5         3.0         5.55         2.0

This uses the formula interface: . ~ Species means all other features (.) grouped by the feature Species.

Sampling

Random sampling

# Draw 20 row indices without replacement. seq_len() is safer than 1:nrow(x),
# which silently yields c(1, 0) when nrow(x) == 0.
# Note: no set.seed() is used, so the sample differs on every run.
id <- sample(seq_len(nrow(iris)), 20)
id
##  [1]  51 109  57  10 145 100  58 112 147  16  63  47 114  45  39 117 129
## [18] 120  14 140
s <- iris[id,]
plot(s, col=s$Species)

Stratified sampling

You need to install the package sampling with: install.packages("sampling")

library(sampling)
id2 <- strata(iris, stratanames="Species", size=c(5,5,5), method="srswor")
id2
##        Species ID_unit Prob Stratum
## 1       setosa       1  0.1       1
## 3       setosa       3  0.1       1
## 21      setosa      21  0.1       1
## 25      setosa      25  0.1       1
## 42      setosa      42  0.1       1
## 57  versicolor      57  0.1       2
## 65  versicolor      65  0.1       2
## 66  versicolor      66  0.1       2
## 81  versicolor      81  0.1       2
## 85  versicolor      85  0.1       2
## 112  virginica     112  0.1       3
## 117  virginica     117  0.1       3
## 129  virginica     129  0.1       3
## 135  virginica     135  0.1       3
## 143  virginica     143  0.1       3
s2 <- iris[id2$ID_unit,]
plot(s2, col=s2$Species)

Features

Dimensionality reduction (Principal Components Analysis - PCA)

Look at data first

library(scatterplot3d)
scatterplot3d(iris[,1:3], color=as.integer(iris$Species))

Interactive 3D plots (needs package rgl)

#library(rgl)
#plot3d(as.matrix(iris[,1:3]), col=as.integer(iris$Species), size=5)

Interactive 3D plots (needs package plotly)

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# plotly (>= 4.0) requires the formula interface (~column) to reference
# data-frame columns; bare column names are no longer resolved against `iris`.
plot_ly(iris, x = ~Sepal.Length, y = ~Petal.Length, z = ~Sepal.Width,
  size = ~Petal.Width, color = ~Species, type = "scatter3d", mode = "markers")

Calculate the principal components

pc <- prcomp(as.matrix(iris[,1:4]))

How important is each principal component?

plot(pc)

Inspect the raw object (display structure)

str(pc)
## List of 5
##  $ sdev    : num [1:4] 2.056 0.493 0.28 0.154
##  $ rotation: num [1:4, 1:4] 0.3614 -0.0845 0.8567 0.3583 -0.6566 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  $ center  : Named num [1:4] 5.84 3.06 3.76 1.2
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  $ scale   : logi FALSE
##  $ x       : num [1:150, 1:4] -2.68 -2.71 -2.89 -2.75 -2.73 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:4] "PC1" "PC2" "PC3" "PC4"
##  - attr(*, "class")= chr "prcomp"
plot(pc$x, col=iris$Species) # plot the first 2 principal components

Plot the projected data and add the original dimensions as arrows

biplot(pc, col = c("grey", "red"))

Feature selection

We will talk about feature selection when we discuss classification models.

Discretize features

plot(iris$Petal.Width, 1:150, ylab="index")

A histogram is a better visualization for the distribution of a single variable.

hist(iris$Petal.Width)

Equal interval width

cut(iris$Sepal.Width, breaks=3)
##   [1] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6]
##   [8] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [15] (3.6,4.4] (3.6,4.4] (3.6,4.4] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
##  [22] (3.6,4.4] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [29] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (3.6,4.4] (3.6,4.4] (2.8,3.6]
##  [36] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]  
##  [43] (2.8,3.6] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4] (2.8,3.6] (3.6,4.4]
##  [50] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]  
##  [57] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]   (2.8,3.6] (2,2.8]  
##  [64] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]  
##  [71] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2,2.8]  
##  [78] (2.8,3.6] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2,2.8]   (2,2.8]  
##  [85] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]  
##  [92] (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6]
##  [99] (2,2.8]   (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [106] (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2,2.8]  
## [113] (2.8,3.6] (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (3.6,4.4] (2,2.8]  
## [120] (2,2.8]   (2.8,3.6] (2,2.8]   (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6]
## [127] (2,2.8]   (2.8,3.6] (2,2.8]   (2.8,3.6] (2,2.8]   (3.6,4.4] (2,2.8]  
## [134] (2,2.8]   (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## [141] (2.8,3.6] (2.8,3.6] (2,2.8]   (2.8,3.6] (2.8,3.6] (2.8,3.6] (2,2.8]  
## [148] (2.8,3.6] (2.8,3.6] (2.8,3.6]
## Levels: (2,2.8] (2.8,3.6] (3.6,4.4]

Other methods (equal frequency, k-means clustering, etc.)

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# NOTE(review): newer versions of arules renamed the `categories` argument to
# `breaks` (this applies to all discretize() calls in this document) — confirm
# against the installed arules version.
discretize(iris$Petal.Width, method="interval", categories=3)
##   [1] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##   [8] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [15] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [22] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [29] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [36] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [43] [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9) [0.1,0.9)
##  [50] [0.1,0.9) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [57] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [64] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [71] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [78] [1.7,2.5] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [85] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [92] [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7) [0.9,1.7)
##  [99] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [0.9,1.7) [0.9,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## Levels: [0.1,0.9) [0.9,1.7) [1.7,2.5]
discretize(iris$Petal.Width, method="frequency", categories=3)
##   [1] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##   [8] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [15] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [22] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [29] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [36] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [43] [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0) [0.1,1.0)
##  [50] [0.1,1.0) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [57] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [64] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [71] [1.7,2.5] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [78] [1.7,2.5] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [85] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [92] [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7) [1.0,1.7)
##  [99] [1.0,1.7) [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [106] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [113] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [120] [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [127] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [134] [1.0,1.7) [1.0,1.7) [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [141] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## [148] [1.7,2.5] [1.7,2.5] [1.7,2.5]
## Levels: [0.1,1.0) [1.0,1.7) [1.7,2.5]
discretize(iris$Petal.Width, method="cluster", categories=3)
##   [1] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##   [6] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [11] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [16] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [21] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [26] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [31] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [36] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [41] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [46] [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785) [0.100,0.785)
##  [51] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [56] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [61] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [66] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [71] [1.691,2.500] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [76] [0.785,1.691) [0.785,1.691) [1.691,2.500] [0.785,1.691) [0.785,1.691)
##  [81] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [86] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [91] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
##  [96] [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691) [0.785,1.691)
## [101] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [106] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [111] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [116] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691)
## [121] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [126] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691)
## [131] [1.691,2.500] [1.691,2.500] [1.691,2.500] [0.785,1.691) [0.785,1.691)
## [136] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [141] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## [146] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500] [1.691,2.500]
## Levels: [0.100,0.785) [0.785,1.691) [1.691,2.500]
hist(iris$Petal.Width,
  main = "Discretization: interval", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="interval",
  categories=3,onlycuts=TRUE), col="blue")

hist(iris$Petal.Width,
  main = "Discretization: frequency", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="frequency",
  categories=3,onlycuts=TRUE), col="blue")

hist(iris$Petal.Width,
  main = "Discretization: cluster", sub = "Blue lines are boundaries")
abline(v=discretize(iris$Petal.Width, method="cluster",
  categories=3,onlycuts=TRUE), col="blue")

Standardize data (Z-score)

Standardize the scale of features to make them comparable. For each column the mean is subtracted (centering) and it is divided by the standard deviation (scaling). Now most values should be in [-3,3].

iris.scaled <- scale(iris[1:4])
head(iris.scaled)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052
## [6,]   -0.5353840  1.93331463    -1.165809   -1.048667
summary(iris.scaled)
##   Sepal.Length       Sepal.Width       Petal.Length      Petal.Width     
##  Min.   :-1.86378   Min.   :-2.4258   Min.   :-1.5623   Min.   :-1.4422  
##  1st Qu.:-0.89767   1st Qu.:-0.5904   1st Qu.:-1.2225   1st Qu.:-1.1799  
##  Median :-0.05233   Median :-0.1315   Median : 0.3354   Median : 0.1321  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.67225   3rd Qu.: 0.5567   3rd Qu.: 0.7602   3rd Qu.: 0.7880  
##  Max.   : 2.48370   Max.   : 3.0805   Max.   : 1.7799   Max.   : 1.7064

Proximities: Similarities and distances

Note: R actually only uses dissimilarities/distances.

Minkowski distances

iris.scaled[1:5,]
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.8976739  1.01560199    -1.335752   -1.311052
## [2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
## [3,]   -1.3807271  0.32731751    -1.392399   -1.311052
## [4,]   -1.5014904  0.09788935    -1.279104   -1.311052
## [5,]   -1.0184372  1.24503015    -1.335752   -1.311052

Calculate distances matrices between the first 5 flowers (use only the 4 numeric columns).

dist(iris.scaled[1:5,], method="euclidean")
##           1         2         3         4
## 2 1.1722914                              
## 3 0.8427840 0.5216255                    
## 4 1.0999999 0.4325508 0.2829432          
## 5 0.2592702 1.3818560 0.9882608 1.2459861
dist(iris.scaled[1:5,], method="manhattan")
##           1         2         3         4
## 2 1.3886674                              
## 3 1.2279853 0.7570306                    
## 4 1.5781768 0.6483657 0.4634868          
## 5 0.3501915 1.4973323 1.3366502 1.6868417
dist(iris.scaled[1:5,], method="maximum")
##           1         2         3         4
## 2 1.1471408                              
## 3 0.6882845 0.4588563                    
## 4 0.9177126 0.3622899 0.2294282          
## 5 0.2294282 1.3765690 0.9177126 1.1471408

Note: Don’t forget to scale the data if the ranges are very different!

Distances for binary data (Jaccard and Hamming)

b <- rbind(
  c(0,0,0,1,1,1,1,0,0,1),
  c(0,0,1,1,1,0,0,1,0,0)
  )
b
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    0    0    0    1    1    1    1    0    0     1
## [2,]    0    0    1    1    1    0    0    1    0     0

Jaccard index

Jaccard index is a similarity measure so R reports 1-Jaccard

dist(b, method="binary")
##           1
## 2 0.7142857

Hamming distance

Hamming distance is the number of mismatches

dist(b, method="manhattan")
##   1
## 2 5

Gower’s distance

Works with mixed data

data <- data.frame(
  height= c(      160,    185,    170),
  weight= c(       52,     90,     75),
  sex=    c( "female", "male", "male")
)
data
##   height weight    sex
## 1    160     52 female
## 2    185     90   male
## 3    170     75   male
library(proxy)
## 
## Attaching package: 'proxy'
## 
## The following object is masked from 'package:Matrix':
## 
##     as.matrix
## 
## The following objects are masked from 'package:stats':
## 
##     as.dist, dist
## 
## The following object is masked from 'package:base':
## 
##     as.matrix
dist(data, method="Gower")
##           1         2
## 2 1.0000000          
## 3 0.6684211 0.3315789

Additional proximity measures available in package proxy

library(proxy)
names(pr_DB$get_entries())
##  [1] "Jaccard"         "Kulczynski1"     "Kulczynski2"    
##  [4] "Mountford"       "Fager"           "Russel"         
##  [7] "simple matching" "Hamman"          "Faith"          
## [10] "Tanimoto"        "Dice"            "Phi"            
## [13] "Stiles"          "Michael"         "Mozley"         
## [16] "Yule"            "Yule2"           "Ochiai"         
## [19] "Simpson"         "Braun-Blanquet"  "cosine"         
## [22] "eJaccard"        "correlation"     "Chi-squared"    
## [25] "Phi-squared"     "Tschuprow"       "Cramer"         
## [28] "Pearson"         "Gower"           "Euclidean"      
## [31] "Mahalanobis"     "Bhjattacharyya"  "Manhattan"      
## [34] "supremum"        "Minkowski"       "Canberra"       
## [37] "Wave"            "divergence"      "Kullback"       
## [40] "Bray"            "Soergel"         "Levenshtein"    
## [43] "Podani"          "Chord"           "Geodesic"       
## [46] "Whittaker"       "Hellinger"       "fJaccard"

Relationship between features

Correlation (for ratio/interval scaled features)

Pearson correlation between features (columns)

cor(iris[,1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
plot(iris$Petal.Length, iris$Petal.Width)

cor(iris$Petal.Length, iris$Petal.Width)
## [1] 0.9628654
cor.test(iris$Petal.Length, iris$Petal.Width)
## 
##  Pearson's product-moment correlation
## 
## data:  iris$Petal.Length and iris$Petal.Width
## t = 43.387, df = 148, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9490525 0.9729853
## sample estimates:
##       cor 
## 0.9628654
plot(iris$Sepal.Length, iris$Sepal.Width)

cor(iris$Sepal.Length, iris$Sepal.Width)
## [1] -0.1175698
cor.test(iris$Sepal.Length, iris$Sepal.Width)
## 
##  Pearson's product-moment correlation
## 
## data:  iris$Sepal.Length and iris$Sepal.Width
## t = -1.4403, df = 148, p-value = 0.1519
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.27269325  0.04351158
## sample estimates:
##        cor 
## -0.1175698

Correlation between objects (transpose matrix first)

cc <- cor(t(iris[,1:4]))
dim(cc)
## [1] 150 150
cc[1:10,1:10]
##            [,1]      [,2]      [,3]      [,4]      [,5]      [,6]
##  [1,] 1.0000000 0.9959987 0.9999739 0.9981685 0.9993473 0.9995861
##  [2,] 0.9959987 1.0000000 0.9966071 0.9973966 0.9922327 0.9935919
##  [3,] 0.9999739 0.9966071 1.0000000 0.9983335 0.9990611 0.9993773
##  [4,] 0.9981685 0.9973966 0.9983335 1.0000000 0.9967188 0.9978326
##  [5,] 0.9993473 0.9922327 0.9990611 0.9967188 1.0000000 0.9998833
##  [6,] 0.9995861 0.9935919 0.9993773 0.9978326 0.9998833 1.0000000
##  [7,] 0.9988112 0.9907206 0.9984377 0.9961394 0.9999140 0.9997226
##  [8,] 0.9995381 0.9971181 0.9996045 0.9995456 0.9985032 0.9991788
##  [9,] 0.9980766 0.9985463 0.9983561 0.9998333 0.9960309 0.9972157
## [10,] 0.9965520 0.9990329 0.9969856 0.9993068 0.9937612 0.9952606
##            [,7]      [,8]      [,9]     [,10]
##  [1,] 0.9988112 0.9995381 0.9980766 0.9965520
##  [2,] 0.9907206 0.9971181 0.9985463 0.9990329
##  [3,] 0.9984377 0.9996045 0.9983561 0.9969856
##  [4,] 0.9961394 0.9995456 0.9998333 0.9993068
##  [5,] 0.9999140 0.9985032 0.9960309 0.9937612
##  [6,] 0.9997226 0.9991788 0.9972157 0.9952606
##  [7,] 1.0000000 0.9979521 0.9952140 0.9927272
##  [8,] 0.9979521 1.0000000 0.9994062 0.9983737
##  [9,] 0.9952140 0.9994062 1.0000000 0.9997398
## [10,] 0.9927272 0.9983737 0.9997398 1.0000000
library("seriation") # for pimage
pimage(cc, main = "Correlation between objects")

Convert correlations into dissimilarities

# Strong correlation (positive or negative) maps to a small dissimilarity.
d <- as.dist(1-abs(cc))
pimage(d, main = "Dissimilarities between objects")  # fixed typo in the plot title

Rank correlation (for ordinal features)

Convert the numeric variables into ordered factors (ordinal variables) with three levels using cut (see ? cut)

# Discretize each numeric column into an ordered factor with three levels,
# keeping the original column names. TRUE is spelled out: T is an ordinary
# (reassignable) variable in R, not a reserved word.
iris_ord <- data.frame(
  cut(iris[,1], 3, labels=c("short", "medium", "long"), ordered=TRUE),
  cut(iris[,2], 3, labels=c("short", "medium", "long"), ordered=TRUE),
  cut(iris[,3], 3, labels=c("short", "medium", "long"), ordered=TRUE),
  cut(iris[,4], 3, labels=c("short", "medium", "long"), ordered=TRUE),
  iris[,5])
colnames(iris_ord) <- colnames(iris)
summary(iris_ord)
##  Sepal.Length Sepal.Width Petal.Length Petal.Width       Species  
##  short :59    short :47   short :50    short :50   setosa    :50  
##  medium:71    medium:88   medium:54    medium:54   versicolor:50  
##  long  :20    long  :15   long  :46    long  :46   virginica :50
head(iris_ord$Sepal.Length)
## [1] short short short short short short
## Levels: short < medium < long

Kendall’s tau rank correlation coefficient

cor(sapply(iris_ord[,1:4], xtfrm), method="kendall")
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1437985    0.7418595   0.7295139
## Sepal.Width    -0.1437985   1.0000000   -0.3298796  -0.3154474
## Petal.Length    0.7418595  -0.3298796    1.0000000   0.9198290
## Petal.Width     0.7295139  -0.3154474    0.9198290   1.0000000

Spearman’s rho

cor(sapply(iris_ord[,1:4], xtfrm), method="spearman")
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1569659    0.7937613   0.7843406
## Sepal.Width    -0.1569659   1.0000000   -0.3662775  -0.3517262
## Petal.Length    0.7937613  -0.3662775    1.0000000   0.9399038
## Petal.Width     0.7843406  -0.3517262    0.9399038   1.0000000

Note: unfortunately we have to transform the ordered factors into numbers representing the order with xtfrm first.

Compare to the Pearson correlation on the original data

cor(iris[,1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000

Relationship between nominal and ordinal features

Is sepal length and species related? Use cross tabulation

tbl <- table(Sepal.Length=iris_ord$Sepal.Length, iris_ord$Species)
tbl
##             
## Sepal.Length setosa versicolor virginica
##       short      47         11         1
##       medium      3         36        32
##       long        0          3        17

Test of Independence: Pearson’s chi-squared test is performed with the null hypothesis that the joint distribution of the cell counts in a 2-dimensional contingency table is the product of the row and column marginals (i.e., the null hypothesis H0 is independence).

chisq.test(tbl)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 111.63, df = 4, p-value < 2.2e-16

Using xtabs instead

x <- xtabs(~Sepal.Length + Species, data=iris_ord)
x
##             Species
## Sepal.Length setosa versicolor virginica
##       short      47         11         1
##       medium      3         36        32
##       long        0          3        17
summary(x)
## Call: xtabs(formula = ~Sepal.Length + Species, data = iris_ord)
## Number of cases in table: 150 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 111.63, df = 4, p-value = 3.262e-23

Groupwise averages

aggregate(Sepal.Length ~ Species, data=iris, FUN = mean)
##      Species Sepal.Length
## 1     setosa        5.006
## 2 versicolor        5.936
## 3  virginica        6.588
aggregate(Sepal.Width ~ Species, data=iris, FUN = mean)
##      Species Sepal.Width
## 1     setosa       3.428
## 2 versicolor       2.770
## 3  virginica       2.974

Density estimation

Just plotting the data is not very helpful

plot(iris$Petal.Length, jitter(rep(1, nrow(iris))), ylab ="", yaxt = "n")

Histograms work better

hist(iris$Petal.Length)
rug(iris$Petal.Length)

We can also add a kernel density estimate KDE (red line)

hist(iris$Petal.Length, freq=FALSE)
rug(iris$Petal.Length)
lines(density(iris$Petal.Length), col="red", lwd = 2)

Plot 2d kernel density estimate

library(MASS)
dens <- kde2d(iris$Sepal.Length, iris$Sepal.Width, n=100)

image(dens,
    xlab="Sepal.Length", ylab="Sepal.Width")
contour(dens, add=TRUE)
points(jitter(iris$Sepal.Length, 2), jitter(iris$Sepal.Width, 2),
  cex=.7, pch="+")

Use a 3d plot instead

persp(dens, xlab="Sepal.Length", ylab="Sepal.Width", zlab="density",
  shade=.5)

# Use plotly
#library(plotly)
#plot_ly(dens, z = z, type = "surface")